library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(ggrepel)
library(curl)
library(biomaRt)
library(sqldf)
# Ensembl LD API
library(httr)
library(jsonlite)
library(xml2)
library(gaston)
library(RCurl)
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR")
# *** finemapr ****
## finemapr contains: finemap, CAVIAR, and PAINTOR
# library(finemapr) # devtools::install_github("variani/finemapr")
library(roxygen2) #roxygenize()
# *** locuscomparer ****
# https://github.com/boxiangliu/locuscomparer
library(locuscomparer); #devtools::install_github("boxiangliu/locuscomparer")
# thm <- knitr::knit_theme$get("bipolar")
# knitr::knit_theme$set(thm)
sessionInfo()## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] locuscomparer_1.0.0 roxygen2_6.1.1 susieR_0.6.2.0411
## [4] RCurl_1.95-4.11 bitops_1.0-6 gaston_1.5.4
## [7] RcppParallel_4.4.2 Rcpp_1.0.0 xml2_1.2.0
## [10] jsonlite_1.6 httr_1.4.0 sqldf_0.4-11
## [13] RSQLite_2.1.1 gsubfn_0.7 proto_1.0.0
## [16] biomaRt_2.38.0 curl_3.3 ggrepel_0.8.0
## [19] cowplot_0.9.4 plotly_4.8.0 ggplot2_3.1.0
## [22] dplyr_0.7.8 data.table_1.12.0 DT_0.5.1
## [25] readxl_1.2.0
##
## loaded via a namespace (and not attached):
## [1] Biobase_2.42.0 tidyr_0.8.2 bit64_0.9-7
## [4] viridisLite_0.3.0 assertthat_0.2.0 expm_0.999-3
## [7] stats4_3.5.1 blob_1.1.1 cellranger_1.1.0
## [10] yaml_2.2.0 progress_1.2.0 pillar_1.3.1
## [13] lattice_0.20-38 glue_1.3.0 chron_2.3-53
## [16] digest_0.6.18 colorspace_1.4-0 htmltools_0.3.6
## [19] Matrix_1.2-15 plyr_1.8.4 XML_3.98-1.16
## [22] pkgconfig_2.0.2 purrr_0.3.0 scales_1.0.0
## [25] tibble_2.0.1 IRanges_2.16.0 withr_2.1.2
## [28] BiocGenerics_0.28.0 lazyeval_0.2.1 magrittr_1.5
## [31] crayon_1.3.4 memoise_1.1.0 evaluate_0.12
## [34] tools_3.5.1 prettyunits_1.0.2 hms_0.4.2
## [37] stringr_1.3.1 S4Vectors_0.20.1 munsell_0.5.0
## [40] AnnotationDbi_1.44.0 bindrcpp_0.2.2 compiler_3.5.1
## [43] rlang_0.3.1 grid_3.5.1 htmlwidgets_1.3
## [46] tcltk_3.5.1 rmarkdown_1.11 gtable_0.2.0
## [49] DBI_1.0.0 R6_2.3.0 knitr_1.21
## [52] bit_1.1-14 bindr_0.1.1 commonmark_1.7
## [55] stringi_1.2.4 parallel_3.5.1 tidyselect_0.2.5
## [58] xfun_0.4
print(paste("susieR ", packageVersion("susieR")))## [1] "susieR 0.6.2.411"
createDT <- function(DF, caption="", scrollY=400){
data <- DT::datatable(DF, caption=caption,
extensions = 'Buttons',
options = list( dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
return(data)
}
createDT_html <- function(DF, caption="", scrollY=400){
print( htmltools::tagList( createDT(DF, caption, scrollY)) )
}
# https://stackoverflow.com/questions/42866055/rshiny-download-button-within-rmarkdown
# downloadHandler(filename = function() {
# return(paste('Example', input$SS, '.csv', sep=''))
#
# }, content = function(file) {
# write.csv(RandomSample(), file)
# })Use bioMart to get TSS positions for each gene
get_TSS_position <- function(gene){
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
# att <- listAttributes(mart)
# grep("transcription", att$name, value=TRUE)
TSS <- getBM(mart=mart,
attributes=c("hgnc_symbol","transcription_start_site", "version"),
filters="hgnc_symbol", values=gene)
} For each gene, get the position of top SNP (the one with the greatest effect size/Beta)
## Only the significant subset of results
Nalls_SS_sig <- read_excel("Data/Nalls2018_S2_SummaryStats.xlsx", sheet = "Data")
top_SNPs <- Nalls_SS_sig %>% group_by(`Nearest Gene`) %>% top_n(-1, `P, all studies`) #`Beta, all studies`
top_SNPs <- cbind(Coord=paste(top_SNPs$CHR,top_SNPs$BP, sep=":"), top_SNPs)
createDT(Nalls_SS_sig, caption = "Nalls (2018): Table S2. Detailed summary statistics on all nominated risk variants, known and novel")# Nall (2018) QTL Stats
# Nalls_QTL <- read_excel("Data/Nalls2018_S4_QTL-MR.xlsx")
# createDT(Nalls_QTL, caption = "Nalls (2018) Table S4. Expanded summary statistics for QTL Mendelian randomization") Get all genes surrounding the index SNP (default is 1Mb upstream + 1Mb downstream) Nalls et al. (2019) data: https://github.com/neurogenetics/meta5
get_flanking_SNPs <- function(gene, top_SNPs, bp_distance=1000000){ #1000000
topSNP_sub <- subset(top_SNPs, `Nearest Gene`==gene)
## Full dataset
filePath <- "Data/META.PD.NALLS2014.PRS.tsv"# "Data/nallsEtal2019_no23andMe.tab.txt"
# Get col names
# strsplit( readLines(filePath, n = 2), "\t")
minPos <- topSNP_sub$BP - bp_distance
maxPos <- topSNP_sub$BP + bp_distance
geneSubset <- read.csv.sql(filePath, sep="\t", stringsAsFactors=F,
sql = paste('select * from file where CHR =',topSNP_sub$CHR,
'AND POS BETWEEN', minPos, 'AND', maxPos))
geneSubset <- geneSubset %>% dplyr::rename(SNP=MarkerName)
return(geneSubset)
}
# flankingSNPs <- get_flanking_SNPs("LRRK2", top_SNPs)gaston_LD <- function(gene, flankingSNPs, reference="1KG_Phase1", superpopulation="EUR"){
# Download portion of vcf from 1KG website
region <- paste(flankingSNPs$CHR[1],":",min(flankingSNPs$POS),"-",max(flankingSNPs$POS), sep="")
chrom <- flankingSNPs$CHR[1]
if(reference=="1KG_Phase3"){
cat("LD Reference Panel = 1KG_Phase3 \n")
vcf_URL <- paste("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr",chrom,
".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",sep="")
if(!exists("popDat_1KGphase3")){
popDat <- popDat_1KGphase3 <- read.delim("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel", header = T, row.names = NULL)
colnames(popDat_1KGphase3) <- c("sample","population","superpopulation","gender")
} else{popDat <- popDat_1KGphase3}
} else if (reference=="1KG_Phase1") {
cat("LD Reference Panel = 1KG_Phase1 \n")
vcf_URL <- paste("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr",chrom,
".phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz", sep="")
# Download individual-level metadata
if(!exists("popDat_1KGphase1")){
popDat <- popDat_1KGphase1 <- read.delim("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel", header = F, row.names = NULL)
colnames(popDat) <- c("sample","population","superpopulation","seq_platform1","seq_platform2")
} else { popDat <- popDat_1KGphase1 }
}
# library(Rsamtools); #BiocManager::install("Rsamtools")
system(paste("tabix -fh ",vcf_URL ,region, "> subset.vcf"))
vcf_name <- paste(basename(vcf_URL), ".tbi",sep="")
# Import w/ gaston and further subset
bed.file <- read.vcf("subset.vcf")
## Subset rsIDs
bed <- select.snps(bed.file, id %in% flankingSNPs$SNP)
# Subset Individuals
selectedInds <- subset(popDat, superpopulation %in% superpopulation)
bed <- select.inds(bed, id %in% selectedInds$sample)
# Cleanup extra files
rm(bed.file)
file.remove(vcf_name)
file.remove("subset.vcf")
# Calculate pairwise LD for all SNP combinations
ld.x <- gaston::LD(bed, lim = c(1,ncol(bed)))
# LD plot
limit <- 20
LD.plot( ld.x[1:limit,1:limit], snp.positions = bed@snps$pos[1:limit])
# Double check subsetting
LD_matrix <- ld.x[row.names(ld.x) %in% flankingSNPs$SNP, colnames(ld.x) %in% flankingSNPs$SNP]
return(LD_matrix)
}
# LD_matrix <- gaston_LD("LRRK2",flankingSNPs) susie_on_gene <- function(gene, top_SNPs, reference="1KG_Phase1"){
flankingSNPs <- get_flanking_SNPs(gene, top_SNPs)
### Get LD matrix
LD_matrix <- gaston_LD(gene, flankingSNPs, reference)
## Turn LD matrix into positive semi-definite matrix
# LD_matrix2 <- ifelse(matrixcalc::is.positive.semi.definite(LD_matrix),
# LDmatrix,
# Matrix::nearPD(LD_matrix)$mat %>% as.matrix() )
## Subset summary stats to only include SNPs found in query
geneSubset <- subset(flankingSNPs, SNP %in% row.names(LD_matrix))
b <- geneSubset$Effect
se <- geneSubset$StdErr
# Run Susie
fitted_bhat <- susie_bhat(bhat = b, shat = se,
R = LD_matrix,
n = nrow(LD_matrix),
L = 1, # 1
scaled_prior_variance = 0.1,
estimate_residual_variance = TRUE, # TRUE
estimate_prior_variance = FALSE, # FALSE
verbose = T,
# var_y = var(b),
standardize = TRUE
)
# Format results
geneSubset$Coord <- paste(geneSubset$CHR, geneSubset$POS, sep=":")
susieDF <- data.frame(SNP=names(fitted_bhat$X_column_scale_factors), PIP=fitted_bhat$pip) %>%
base::merge(subset(geneSubset, select=c("SNP","Effect","P.value", "POS","Coord")), by="SNP") %>%
mutate(POS=as.numeric(POS))
return(susieDF)
}
# susieDF <- susie_on_gene("LRRK2", top_SNPs) before_after_plots <- function(gene, susieDF, topVariants=3){
roundBreaks <- seq(plyr::round_any(min(susieDF$POS),10000), max(susieDF$POS),500000)
## Before fine-mapping
geneSubset <- susieDF %>% arrange(desc(abs(Effect)))
labelSNPs_Effect <- geneSubset[1:topVariants,]
yLimits <- c(min(-log(susieDF$Effect)), max(-log(susieDF$Effect))+1)
before_plot <- ggplot(geneSubset, aes(x=POS, y=Effect, label=SNP, color= -log(P.value) )) +
ylim(yLimits) +
geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
geom_point() +
geom_point(data=labelSNPs_Effect[1,], pch=21, fill=NA, size=4, colour="red", stroke=1) +
geom_segment(aes(xend=POS, yend=0, color= -log(P.value)), alpha=.5) +
geom_text_repel(data=labelSNPs_Effect, aes(label=SNP), segment.alpha = .2, nudge_y = .05, nudge_x = .5) +
labs(title=paste(gene," (",length(susieDF$PIP)," variants)","\nBefore Fine Mapping",sep=""), y="Beta", x="Position",
color="GWAS\n-log(p-value)") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_continuous(breaks = roundBreaks)
## After fine-mapping
susieDF <- susieDF %>% arrange(desc(PIP))
labelSNPs_PIP <- susieDF[1:topVariants,]
yLimits <- c(min(susieDF$PIP), max(susieDF$PIP)+.1)
after_plot <- ggplot(susieDF, aes(x=POS, y=PIP, label=SNP, color= -log(P.value) )) +
ylim(yLimits) +
geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
geom_point() +
geom_point(data=susieDF[susieDF$PIP == max(susieDF$PIP),],
pch=21, fill=NA, size=4, colour="green", stroke=1) +
geom_segment(aes(xend=POS, yend=yLimits[1], color= -log(P.value))) +
geom_text_repel(data=labelSNPs_PIP, aes(label=SNP), segment.alpha = .2, nudge_y = .05, nudge_x = .5) +
labs(title=paste(gene," (",length(susieDF$PIP)," variants)","\nAfter Fine Mapping",sep=""), y="PIP", x="Position",
color="GWAS\n-log(p-value)") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_continuous(breaks = roundBreaks)
plot_grid(before_plot, after_plot, nrow = 2) %>% print()
# susie_plot(fitted_bhat, y="PIP", b=b, add_bar = T)
createDT_html(susieDF, paste("susieR Results: ", gene), scrollY = 200)
}
# before_after_plots(susieDF)Nalls_vs_SusieR_overlap <- function(Nalls_SS_sig, susieDF, max_SNPs=10){
# Get top SNPs from Nalls et all fine mapping
Nalls_dat <- subset(Nalls_SS_sig, `Nearest Gene`==gene) %>%
arrange(`P, all studies`) %>%
mutate(SNP=as.character(SNP))
Nalls_SNPs <-if(max_SNPs > length(Nalls_dat$SNP)){ Nalls_dat$SNP}else{Nalls_dat$SNP[1:max_SNPs]}
# Get top SNPs from susieR fine mapping
susieR_dat <- subset(susieDF, PIP!=0) %>%
arrange(desc(PIP)) %>%
mutate(SNP=as.character(SNP))
susieR_SNPs <-if(max_SNPs > length(susieR_dat$SNP)){ susieR_dat$SNP}else{susieR_dat$SNP[1:max_SNPs]}
# Calculate percent
overlap <- length(intersect(Nalls_SNPs, susieR_SNPs))
percentOverlap <- overlap / length(susieR_SNPs) * 100
cat("\n",overlap,"/",length(susieR_SNPs), " (",round(percentOverlap,2),
"%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping. ","\n",sep="")
}
# Nalls_vs_SusieR_overlap(Nalls_SS_sig, susieDF, max_SNPs=10)geneList <- c("LRRK2","GBAP1","SNCA","VPS13C","GCH1")#unique(top_SNPs$`Nearest Gene`)
#Nalls_SS %>% group_by(`Nearest Gene`) %>% tally() %>% subset(n>2)
fineMapped_topSNPs <- data.table()
for (gene in geneList){
cat("\n")
cat("### ", gene, "\n")
susieDF <- susie_on_gene(gene, top_SNPs, reference="1KG_Phase3")
before_after_plots(gene, susieDF)
Nalls_vs_SusieR_overlap(Nalls_SS_sig, susieDF, max_SNPs=10)
# Create summary table for all genes
newEntry <- cbind(data.table(Gene=gene), subset(susieDF, PIP==max(PIP)) %>% as.data.table())
fineMapped_topSNPs <- rbind(fineMapped_topSNPs, newEntry)
cat("\n")
}LD Reference Panel = 1KG_Phase3 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-8024.60164396116” [1] “objective:-8024.57916398717” [1] “objective:-8024.57909314518” [1] “objective:-8024.57909292161”
## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).
0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.
LD Reference Panel = 1KG_Phase3 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-4050.725081463” [1] “objective:-4050.52574123553” [1] “objective:-4050.52550707646” [1] “objective:-4050.52550680078”
## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).
0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.
LD Reference Panel = 1KG_Phase3 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-6907.57999180984” [1] “objective:-6906.0851809898” [1] “objective:-6906.08461779126” [1] “objective:-6906.08461758221”
## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced
1/10 (10%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.
LD Reference Panel = 1KG_Phase3 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-8005.70248923636” [1] “objective:-8005.68196375588” [1] “objective:-8005.68194489826” [1] “objective:-8005.68194488099”
## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).
0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.
LD Reference Panel = 1KG_Phase3 ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set. [1] “objective:-6864.40734943593” [1] “objective:-6864.39131429051” [1] “objective:-6864.39127021551” [1] “objective:-6864.39127009453”
## Warning in log(susieDF$Effect): NaNs produced
## Warning in log(susieDF$Effect): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_hline).
0/10 (0%) of SNPs identified by susieR are also identified by Nalls et al. (2018) fine mapping.
createDT(fineMapped_topSNPs, "Potential Causal SNPs Identified by susieR", scrollY = 200)## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html